### racial distinctiveness ratings ###
  # this file:
    # takes raw name ratings data
    # estimates name-trait ratings
    # outputs data in long and, for race, simplified forms
  # for the full list and subset of names 

#load required packages ####
library(ggplot2)
library(dplyr)
library(tidyverse)
library(matrixStats)
library(lme4)
library(R.utils)
library(janitor)
library(here)
summarize <- dplyr::summarize


### FIRST NAMES race and trait ratings ####

#read in raw ratings data
tab1 <- read.csv(here("comb_longratings.csv"))

#split "First Last" into two columns and keep only the first name
tab1 <- tab1 %>% separate(name, c("first", "last"), " ")
tab1$last <- NULL

#count ratings per name-trait pair
rating_counts <- tab1 %>%
  group_by(first, trait) %>%
  dplyr::summarize(n_ratings = n()) %>%
  ungroup()
#NOTE: merge() re-sorts the rows by (first, trait). The order of `traits`
#below, and therefore the column order of the wide output, follows from that
#sort, so merge() is used deliberately rather than an order-preserving join.
tab1 <- merge(tab1, rating_counts, by = c("first", "trait"))
#keep pairs with MORE than 15 ratings (pairs with exactly 15 are dropped too)
#and discard rows with an empty first name
tab1 <- tab1[tab1$n_ratings > 15 & tab1$first != "", ]


#estimate name coefs for each trait
## BEWARE: this takes quite a long time to run- up to several hours
traits <- unique(tab1$trait)
#one data frame of coefficients per trait, combined after the loop
#(preallocated instead of growing a data frame with rbind() each iteration)
trait_fits <- vector("list", length(traits))
#lmer labels each fixed effect as <variable name><level>, e.g. "firstAnna";
#the variable name is stripped below to recover the bare name
coef_prefix <- "first"

for (i in seq_along(traits)) {
  #no-intercept model: one fixed effect per name, random intercept per rater
  mod1 <- lmer(rate ~ first - 1 + (1 | rid),
               data = tab1[tab1$trait == traits[i], ],
               na.action = "na.omit")
  #columns 1:2 are "Estimate" and "Std. Error"; drop = FALSE keeps a matrix
  #even if the model has a single coefficient
  mod1s <- as.data.frame(summary(mod1)$coefficients[, 1:2, drop = FALSE])
  mod1s$att <- traits[i]
  mod1s$name <- str_sub(rownames(mod1s), str_length(coef_prefix) + 1)
  rownames(mod1s) <- NULL
  trait_fits[[i]] <- mod1s
  print(i)
}

#long format: one row per name-trait pair
#columns: Estimate, Std. Error, att (trait), name
res <- bind_rows(trait_fits)

#export long-format data
write.csv(res, here("comb_traitratings_withSEs.csv"), row.names = FALSE)

#pivot and export wide-format data (one row per name, one column per trait);
#id_cols is named explicitly and the standard-error column is not carried over
wideres <- res %>%
  pivot_wider(id_cols = name, names_from = att, values_from = Estimate)
write.csv(wideres, here("comb_traitratings.csv"), row.names = FALSE)



### create wide-format race data merged with name demographics ####

#read in name demographics
tab2 <- read.csv(here("firstnames.csv"))
  #-firstnames.csv is Tzioumis et al.'s data on name race
  #data downloaded from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TYJKEZ
  #saved second sheet ("Data") as a .csv called "firstnames.csv"
colnames(tab2)[1] <- "first"
tab2 <- select(tab2, first, pctwhite, pcthispanic, pctblack, pctapi)
#convert names to "Anna"-style capitalisation so they match the rated names
tab2$first <- capitalize(tolower(tab2$first))

#read in wide-format data
#(written to the here() project root earlier in this script, so read it from
#the same place rather than from the current working directory)
tab3 <- read.csv(here("comb_traitratings.csv"))

#merge name demographics and estimates (keep every rated name)
tab3 <- merge(tab3, tab2, by.x = "name", by.y = "first", all.x = TRUE)

#extract order of races for each name
#columns are, in order: African.American, Asian, Hispanic, white
race_ratings <- as.matrix(select(tab3, African.American, Asian, Hispanic, white))
racerank <- rowRanks(race_ratings)
#the race whose rating has rank 4 (the highest of the four) labels the name;
#rows where no column reaches rank 4 get NA and are removed below
tab3$race <- case_when(racerank[, 1] == 4 ~ "black",
                       racerank[, 2] == 4 ~ "asian",
                       racerank[, 3] == 4 ~ "hispanic",
                       racerank[, 4] == 4 ~ "white")
has_race <- !is.na(tab3$race)
tab3 <- tab3[has_race, ]
race_ratings <- race_ratings[has_race, , drop = FALSE]

#get first and second most-typical-rated races
#top_two is a 2 x nrow matrix: row 1 = highest rating, row 2 = second highest.
#Sorting the values directly keeps the result numeric and also copes with tied
#ratings (a tie for the top spot gives a distinctiveness of 0).
top_two <- vapply(
  seq_len(nrow(race_ratings)),
  function(r) unname(sort(race_ratings[r, ], decreasing = TRUE)[1:2]),
  numeric(2)
)

#subtract for distinctiveness score
tab3$distinct <- top_two[1, ] - top_two[2, ]

#add gender using NC data
voters <- read.csv(here("nc_voters.csv"))

tab3 <- merge(tab3, select(voters, first_name, pct_f),
              by.x = "name", by.y = "first_name", all.x = TRUE)
#names with pct_f of exactly .5, or with no match in the NC data, get NA gender
tab3 <- tab3 %>% mutate(gender = case_when(pct_f > .5 ~ "F",
                                           pct_f < .5 ~ "M"))

tab3 <- tab3 %>%
  select(name, African.American:white, a.man, a.woman, pctwhite:gender)

#export wide-format data
write.csv(tab3, "nameratings.csv", row.names = FALSE)

#clear data
rm(list=ls())



# LAST NAMES ####
#read in raw last-name ratings data
tab1 <- read.csv(here("comb_lastnameratings.csv"))

#count ratings per name-trait pair
rating_counts <- tab1 %>%
  group_by(name, trait) %>%
  dplyr::summarize(n_ratings = n()) %>%
  ungroup()
#NOTE: merge() re-sorts the rows by (name, trait); the order of `traits` below
#(and so the column order of the wide output) follows from that sort
tab1 <- merge(tab1, rating_counts, by = c("name", "trait"))
#keep pairs with MORE than 15 ratings (pairs with exactly 15 are dropped too)
#and discard rows with an empty name
tab1 <- tab1[tab1$n_ratings > 15 & tab1$name != "", ]
#split the name on the space and keep only rows that have a second piece
tab1 <- tab1 %>% separate(name, c("first", "last"), " ")
tab1 <- tab1[!is.na(tab1$last), ]

#estimate name coefs for each trait
traits <- unique(tab1$trait)
#one data frame of coefficients per trait, combined after the loop
#(preallocated instead of growing a data frame with rbind() each iteration)
trait_fits <- vector("list", length(traits))
#lmer labels each fixed effect as <variable name><level>, e.g. "lastSmith";
#the variable name is stripped below to recover the bare name
coef_prefix <- "last"

for (i in seq_along(traits)) {
  #no-intercept model: one fixed effect per last name, random intercept per rater
  mod1 <- lmer(rate ~ last - 1 + (1 | rid),
               data = tab1[tab1$trait == traits[i], ],
               na.action = "na.omit")
  #columns 1:2 are "Estimate" and "Std. Error"; drop = FALSE keeps a matrix
  #even if the model has a single coefficient
  mod1s <- as.data.frame(summary(mod1)$coefficients[, 1:2, drop = FALSE])
  mod1s$att <- traits[i]
  mod1s$name <- str_sub(rownames(mod1s), str_length(coef_prefix) + 1)
  rownames(mod1s) <- NULL
  trait_fits[[i]] <- mod1s
  print(i)
}

#long format: one row per name-trait pair
#columns: Estimate, Std. Error, att (trait), name
res <- bind_rows(trait_fits)

#export long-format data
write.csv(res, here("lastname_traitratings_withSEs.csv"), row.names = FALSE)

#pivot and export wide-format data (one row per name, one column per trait);
#id_cols is named explicitly and the standard-error column is not carried over
wideres <- res %>%
  pivot_wider(id_cols = name, names_from = att, values_from = Estimate)
write.csv(wideres, here("lastname_traitratings.csv"), row.names = FALSE)

rm(list=ls())

## COMBINED FIRST AND LAST RATINGS ####

#read in raw ratings data
#(same file, same here() location as the first-name section of this script)
tab1 <- read.csv(here("comb_longratings.csv"))

#separate and remove those without last names (the full name column is kept)
tab1 <- tab1 %>% separate(name, c("first", "last"), " ", remove = FALSE)
tab1 <- tab1[!is.na(tab1$last), ]
#count ratings per name-trait pair
rating_counts <- tab1 %>%
  group_by(name, trait) %>%
  dplyr::summarize(n_ratings = n()) %>%
  ungroup()
#NOTE: merge() re-sorts the rows by (name, trait); the order of `traits` below
#(and so the column order of the wide output) follows from that sort
tab1 <- merge(tab1, rating_counts, by = c("name", "trait"))
#keep pairs with MORE than 15 ratings (pairs with exactly 15 are dropped too)
#and discard rows with an empty name
tab1 <- tab1[tab1$n_ratings > 15 & tab1$name != "", ]


#estimate name coefs for each trait
traits <- unique(tab1$trait)
#one data frame of coefficients per trait, combined after the loop
#(preallocated instead of growing a data frame with rbind() each iteration)
trait_fits <- vector("list", length(traits))
#lmer labels each fixed effect as <variable name><level>, e.g.
#"nameAnna Smith"; the variable name is stripped below
coef_prefix <- "name"

for (i in seq_along(traits)) {
  #no-intercept model: one fixed effect per full name, random intercept per rater
  mod1 <- lmer(rate ~ name - 1 + (1 | rid),
               data = tab1[tab1$trait == traits[i], ],
               na.action = "na.omit")
  #columns 1:2 are "Estimate" and "Std. Error"; drop = FALSE keeps a matrix
  #even if the model has a single coefficient
  mod1s <- as.data.frame(summary(mod1)$coefficients[, 1:2, drop = FALSE])
  mod1s$att <- traits[i]
  mod1s$name <- str_sub(rownames(mod1s), str_length(coef_prefix) + 1)
  rownames(mod1s) <- NULL
  trait_fits[[i]] <- mod1s
  print(i)
}

#format for export: one row per full name, one column per trait
res <- bind_rows(trait_fits)
wideres <- res %>%
  pivot_wider(id_cols = name, names_from = att, values_from = Estimate) %>%
  clean_names()

#merge with last names
#(this file is written to the here() project root by the last-name section)
tab2 <- read.csv(here("lastname_traitratings.csv"))
tab2 <- tab2 %>% clean_names()
wideres <- bind_rows(tab2, wideres)

#names without a space are last-name-only rows; fill = "right" leaves their
#`last` piece as NA without the "missing pieces" warning
wideres <- wideres %>%
  separate(name, c("first", "last"), " ", remove = FALSE, fill = "right")
wideres$lastonly <- is.na(wideres$last)
#for last-name-only rows the single piece landed in `first`: move it to `last`
#and blank out `first` (the two assignments are evaluated in this order)
wideres <- wideres %>%
  mutate(last = if_else(lastonly, first, last),
         first = if_else(lastonly, NA_character_, first)) %>%
  select(-name, -lastonly) %>%
  arrange(last)

write.csv(wideres, "firstlast_ratings.csv", row.names = FALSE)


